In [1]:
from collections import OrderedDict
import datetime

import numpy as np
import pandas as pd

from IPython.display import HTML

import statsmodels.api as sm

from break4w.categorical import Categorical
from break4w.continous import Continous
from break4w.question import Question
from break4w.bool import Bool
from break4w.data_dictionary import DataDictionary

I'm going to try to make a data dictionary object using columns from an example data dictionary and a study I worked with a while ago. I'll start by assuming we can convert a text document into a series of dictionaries to build on. I'm going to use the data description from the Statsmodels National Election DataSet (`anes96`).


In [2]:
# Load the statsmodels `anes96` example dataset and wrap its `.data` in a DataFrame.
# NOTE(review): `data_` is not referenced again in the visible cells.
data_ = pd.DataFrame(sm.datasets.anes96.load().data)

In [3]:
# Column descriptions used to build the data dictionary. Each dict is later
# unpacked as keyword arguments into a break4w question class
# (`question_type(**col_)`), so the keys are constructor argument names.
columns = [
    {
        'name': 'popul',
        'description': 'Census place population in 1000s',
        'dtype': float,
        'units': 'people',
        'magnitude': 1000,
    },
    {
        'name': 'TVnews',
        'description': 'Number of times per week that respondent watches TV news.',
        'dtype': int,
        'units': 'views per week',
        'clean_name': 'TV news',
        # Lower bound of 0, no upper bound.
        'limits': [0, None],
    },
    {
        'name': 'PID',
        'description': 'Party identification of respondent',
        'dtype': int,
        'order': [0, 1, 2, 3, 4, 5, 6],
        # Human-readable label for each numeric code listed in `order`.
        'numeric_mapping': {
            0: 'Strong Democrat',
            1: 'Weak Democrat',
            2: 'Independent-Democrat',
            3: 'Independent-Independent',
            4: 'Independent-Republican',
            5: 'Weak Republican',
            6: 'Strong Republican',
        },
    },
    {
        'name': 'vote',
        'description': 'Individual expected to vote for Bob Dole',
        'dtype': bool,
    },
]

# Question-type label for each entry of `columns`, matched by position.
# NOTE(review): there are only three labels for four columns, so pairing the
# two lists with zip() leaves 'vote' without a type and it is absent from the
# recorded outputs -- confirm whether a fourth label (e.g. 'bool') is intended.
types = ['continous', 'question', 'categorical']

In [4]:
# Map every accepted question-type label to the break4w class used to build
# that kind of question. Several labels are aliases for the same class.
type_lookup = {
    label: question_class
    for question_class, labels in (
        (Continous, ('continous',)),
        (Categorical, ('categorical', 'multiple choice', 'ordinal')),
        (Bool, ('bool', 'boolean', 'yes/no')),
    )
    for label in labels
}

In [5]:
# Build one question object per column description, keyed by column name.
# Type labels that are not in `type_lookup` fall back to the generic
# `Question` class. zip() pairs the two lists by position and stops at the
# shorter one.
proto_dict = OrderedDict()
for col_, type_ in zip(columns, types):
    question_type = type_lookup.get(type_.lower(), Question)
    proto_dict[col_['name']] = question_type(**col_)

In [6]:
# Inspect the serialized form of the `popul` question: a (type label, properties) pair.
proto_dict['popul'].to_dict()


Out[6]:
('continous',
 {'name': 'popul',
  'description': 'Census place population in 1000s',
  'dtype': float,
  'clean_name': 'Popul',
  'units': 'people'})

In [7]:
# Label -> break4w class mapping. This re-declares the same mapping as the
# earlier `type_lookup` cell; the resulting dict is identical.
type_lookup = {
    label: question_class
    for question_class, labels in (
        (Continous, ('continous',)),
        (Categorical, ('categorical', 'multiple choice', 'ordinal')),
        (Bool, ('bool', 'boolean', 'yes/no')),
    )
    for label in labels
}

In [8]:
# Show the constructed question objects. Only three entries appear because
# `types` holds three labels and zip() stops at the shorter input.
proto_dict


Out[8]:
OrderedDict([('popul', <break4w.continous.Continous at 0x11a4376a0>),
             ('TVnews', <break4w.question.Question at 0x11a437710>),
             ('PID', <break4w.categorical.Categorical at 0x11a437748>)])

In [ ]:


In [9]:
# Build a DataDictionary from the same column descriptions and type labels.
dict_ = DataDictionary(columns, types)

In [10]:
# Print the plain-text summary of the data dictionary (one line per column).
print(dict_)


Data Dictionary with 3 columns
-----------------------------------------------------------------------------
popul (Continous)
TVnews (Question)
PID (Categorical)
-----------------------------------------------------------------------------

In [11]:
# Convert the data dictionary to a DataFrame with one row per column name.
df_ = dict_.to_dataframe()

In [12]:
# Display the tabular view; properties a question does not define show as NaN.
df_


Out[12]:
description dtype type clean_name units limits numeric_mapping order
name
popul Census place population in 1000s float Continous Popul people NaN NaN NaN
TVnews Number of times per week that respondent watch... int Question TV news views per week 0 | None NaN NaN
PID Party identification of respondent int Categorical Pid NaN NaN {0: 'Strong Democrat', 1: 'Weak Democrat', 2: ... 0 | 1 | 2 | 3 | 4 | 5 | 6

In [ ]:


In [ ]:
# NOTE(review): unexecuted repeat of the `df_` display above -- candidate for removal.
df_

In [ ]:
# NOTE(review): `test` is not assigned in any visible cell, so on a fresh
# kernel this cell raises NameError -- presumably a DataDictionary; confirm
# and define it explicitly before this cell.
# Exercises add_question with a (description, type label) pair and with a
# pre-built Continous object. The second and third calls both use columns[1].
test.add_question(columns[0], types[0])
test.add_question(Continous(**columns[1]))
test.add_question(columns[1], types[1])

In [ ]:
# List the keys of `test.columns` (`test` is undefined in the visible cells).
list(test.columns.keys())

In [ ]:
# NOTE(review): rebinds `columns`, replacing the list of column descriptions
# defined earlier; later positional lookups such as `columns[2]` then act on
# `test.columns` instead of that list.
columns = test.columns

In [ ]:
# Display the rebound `columns` before the deletion below.
columns

In [ ]:
# Remove the 'popul' entry from the object bound to `columns`.
# NOTE(review): if `test.columns` returns the underlying mapping rather than a
# copy, this also mutates `test` -- the following cells appear to check that.
del columns['popul']

In [ ]:
# Display `columns` after deleting 'popul'.
columns

In [ ]:
# Compare against `test.columns` to see whether the deletion reached `test`.
test.columns

In [ ]:
# Inspect the `log` attribute of `test` (presumably a record of changes -- TODO confirm).
test.log

In [ ]:
# NOTE(review): `columns` was rebound to `test.columns` in an earlier cell, so
# `columns[2]` no longer indexes the original list of descriptions -- confirm intent.
test.add_question(columns[2], types[2])

In [ ]:
# vars() returns the object's attribute dictionary itself, not a copy, so
# later setattr calls on test['popul'] are visible through `current`.
current = vars(test['popul'])

In [ ]:
# Attribute values to apply to the `popul` question in the next cell.
new = dict(blanks='not applicable', frog='Chowder')

In [ ]:
# Record what each update does before applying it: an existing attribute maps
# to (old value, new value), a brand-new attribute maps to ('add', new value).
# The lookup in `current` must happen before setattr because `current` is the
# object's live attribute dictionary.
change_keys = {}
for k, v in new.items():
    change_keys[k] = (current[k] if k in current else 'add', v)
    setattr(test['popul'], k, v)

In [ ]:
# Read back the `frog` attribute set by the update loop above.
test['popul'].frog

In [ ]:
# Bind a second name to the object returned by test['popul'].
check = test['popul']

In [ ]:
# Set a new attribute through `check`; note the value is the string 'None', not None.
check.cat = 'None'

In [ ]:
# Read the attribute back through `check`.
check.cat

In [ ]:
# Read the same attribute through `test`; it is visible here only if
# test['popul'] returns the same object each time -- TODO confirm.
test['popul'].cat

In [ ]:
# Dump all instance attributes of `check`, including those added above.
vars(check)

In [ ]: